Data source: IMDb Non-Commercial Datasets (https://developer.imdb.com/non-commercial-datasets/)
# Fetch the IMDb "title.basics" dump and load it into `data_imdb`.

# download.file() does not create missing directories, so make sure the
# destination folder exists before writing into it.
dir.create("db", showWarnings = FALSE, recursive = TRUE)

# R's default download timeout (60 s) is often too short for large files;
# raise it for this transfer only and restore the previous setting afterwards.
old_opts <- options(timeout = max(300, getOption("timeout")))
download.file(
  "https://datasets.imdbws.com/title.basics.tsv.gz",
  "db/title_basics.tsv.gz",
  mode = "wb" # binary mode so the gzip archive is written unmodified
)
options(old_opts)

# The file is tab-separated and the literal string "\N" is read as NA.
# NOTE(review): the parsing warning printed after this call may come from
# unbalanced double quotes inside titles -- consider `quote = ""` and inspect
# `readr::problems(data_imdb)` to confirm.
data_imdb <- readr::read_delim(
  "db/title_basics.tsv.gz",
  delim = "\t",
  na = "\\N",
  show_col_types = FALSE
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
# Show the first five rows as a quick look at the loaded table.
data_imdb |>
  head(n = 5)
## # A tibble: 5 × 9
## tconst titleType primaryTitle originalTitle isAdult startYear endYear
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 tt0000001 short Carmencita Carmencita 0 1894 NA
## 2 tt0000002 short Le clown et ses c… Le clown et … 0 1892 NA
## 3 tt0000003 short Pauvre Pierrot Pauvre Pierr… 0 1892 NA
## 4 tt0000004 short Un bon bock Un bon bock 0 1892 NA
## 5 tt0000005 short Blacksmith Scene Blacksmith S… 0 1893 NA
## # ℹ 2 more variables: runtimeMinutes <dbl>, genres <chr>
Use RDS instead of CSV/TSV
# Measure how long it takes to parse the gzipped, tab-separated file from
# disk; the assignment still lands in the calling environment.
system.time({
  data_imdb <- readr::read_delim(
    file = "db/title_basics.tsv.gz",
    delim = "\t",
    na = "\\N",
    show_col_types = FALSE
  )
})
## user system elapsed
## 12.779 2.588 14.326
# Display the tibble (the default tibble print shows the first ten rows).
print(data_imdb)
## # A tibble: 10,375,265 × 9
## tconst titleType primaryTitle originalTitle isAdult startYear endYear
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 tt0000001 short Carmencita Carmencita 0 1894 NA
## 2 tt0000002 short Le clown et ses … Le clown et … 0 1892 NA
## 3 tt0000003 short Pauvre Pierrot Pauvre Pierr… 0 1892 NA
## 4 tt0000004 short Un bon bock Un bon bock 0 1892 NA
## 5 tt0000005 short Blacksmith Scene Blacksmith S… 0 1893 NA
## 6 tt0000006 short Chinese Opium Den Chinese Opiu… 0 1894 NA
## 7 tt0000007 short Corbett and Cour… Corbett and … 0 1894 NA
## 8 tt0000008 short Edison Kinetosco… Edison Kinet… 0 1894 NA
## 9 tt0000009 movie Miss Jerry Miss Jerry 0 1894 NA
## 10 tt0000010 short Leaving the Fact… La sortie de… 0 1895 NA
## # ℹ 10,375,255 more rows
## # ℹ 2 more variables: runtimeMinutes <dbl>, genres <chr>
# Time an aggregation: number of titles per `startYear`, sorted from the
# least to the most frequent year. Every row is counted regardless of
# `titleType`, so `number_movies` includes shorts and other title types.
# All dplyr verbs (including `n()`) are namespace-qualified so the block does
# not depend on dplyr being attached with library().
system.time(
  results <- data_imdb |>
    dplyr::group_by(startYear) |>
    dplyr::summarize(number_movies = dplyr::n()) |>
    dplyr::arrange(number_movies)
)
## user system elapsed
## 0.267 0.040 0.310